import requests
from lxml import etree
from 爬取姓名 import Name
# Request headers mimicking a desktop Chrome/SLBrowser session so the
# BUPT pages serve their normal HTML (includes a previously captured cookie).
headers = {
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': ('Hm_lvt_c3acb27768b401b6598a1ae2797371a4=1693548649,1693581545; '
               'Hm_lpvt_c3acb27768b401b6598a1ae2797371a4=1693581548'),
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 '
                   'SLBrowser/7.0.0.6251 SLBChan/124'),
}

# First, fetch the names of all graduate (research) supervisors from the
# graduate-school page; used later to filter the staff listings.
xianyou_url = Name('https://yzb.bupt.edu.cn/content/content.php?p=8_4_70')
xianyou_all_yanjiushengtea_name = xianyou_url.check_name()

# Then fetch every teacher name from the six staff-listing pages.
# The same XPath applies to all of them, so it is hoisted into a constant.
NAME_XPATH = '//div[@class="main_rpicR"]/h3/a/text()'

s1 = Name("https://smp.bupt.edu.cn/szdw.htm")
ss1 = s1.get_all_name(NAME_XPATH)

s2 = Name("https://smp.bupt.edu.cn/szdw/5.htm")
ss2 = s2.get_all_name(NAME_XPATH)

s3 = Name("https://smp.bupt.edu.cn/szdw/4.htm")
ss3 = s3.get_all_name(NAME_XPATH)

s4 = Name("https://smp.bupt.edu.cn/szdw/3.htm")
ss4 = s4.get_all_name(NAME_XPATH)

s5 = Name("https://smp.bupt.edu.cn/szdw/2.htm")
ss5 = s5.get_all_name(NAME_XPATH)

s6 = Name("https://smp.bupt.edu.cn/szdw/1.htm")
# BUG FIX: the original called s5.get_all_name here, so ss6 repeated the
# names from page 2 instead of reading page 1 via s6.
ss6 = s6.get_all_name(NAME_XPATH)

# Scrape every teacher's personal-page link (href) from the same six pages.
HREF_XPATH = '//div[@class="main_rpicR"]/h3/a/@href'

# NOTE(review): x1..x6 mirror s1..s6 and are never queried below — the href
# lookups reuse s1..s6. They are kept only so the module-level names still
# exist; presumably they could be removed entirely (confirm no other user).
x1 = Name("https://smp.bupt.edu.cn/szdw.htm")
xx1 = s1.get_all_name(HREF_XPATH)

x2 = Name("https://smp.bupt.edu.cn/szdw/5.htm")
xx2 = s2.get_all_name(HREF_XPATH)

x3 = Name("https://smp.bupt.edu.cn/szdw/4.htm")
xx3 = s3.get_all_name(HREF_XPATH)

x4 = Name("https://smp.bupt.edu.cn/szdw/3.htm")
xx4 = s4.get_all_name(HREF_XPATH)

x5 = Name("https://smp.bupt.edu.cn/szdw/2.htm")
xx5 = s5.get_all_name(HREF_XPATH)

x6 = Name("https://smp.bupt.edu.cn/szdw/1.htm")
# BUG FIX: the original called s5.get_all_name here, so xx6 repeated the
# links from page 2 instead of reading page 1 via s6.
xx6 = s6.get_all_name(HREF_XPATH)
# Prefix each relative href with the site root to get absolute URLs.
_SITE_ROOT = 'https://smp.bupt.edu.cn/'
xx1_real = [_SITE_ROOT + href for href in xx1]
xx2_real = [_SITE_ROOT + href for href in xx2]
xx3_real = [_SITE_ROOT + href for href in xx3]
xx4_real = [_SITE_ROOT + href for href in xx4]
xx5_real = [_SITE_ROOT + href for href in xx5]
xx6_real = [_SITE_ROOT + href for href in xx6]

# Pair each teacher name with that teacher's personal-page URL, one dict
# per listing page, then merge everything into xianyou_Name_url_dic1.
# dict(zip(...)) replaces the original per-index .update({k: v}) loops:
# it is idiomatic and cannot raise IndexError if the name and URL lists
# ever come back with different lengths (zip stops at the shorter one).
xianyou_Name_url_dic1 = dict(zip(ss1, xx1_real))
xianyou_Name_url_dic2 = dict(zip(ss2, xx2_real))
xianyou_Name_url_dic3 = dict(zip(ss3, xx3_real))
xianyou_Name_url_dic4 = dict(zip(ss4, xx4_real))
xianyou_Name_url_dic5 = dict(zip(ss5, xx5_real))
xianyou_Name_url_dic6 = dict(zip(ss6, xx6_real))

# Merge pages 2..6 into the page-1 dict (later pages win on duplicates,
# matching the original update order).
for _extra in (xianyou_Name_url_dic2, xianyou_Name_url_dic3,
               xianyou_Name_url_dic4, xianyou_Name_url_dic5,
               xianyou_Name_url_dic6):
    xianyou_Name_url_dic1.update(_extra)

# Keep only the teachers who appear in the graduate-supervisor name list.
# BUG FIX: the original deleted keys from the dict while iterating over the
# dict itself (RuntimeError in Python 3), and its flag logic was inverted —
# it deleted a name only when *every* supervisor name compared equal to it,
# so in practice nothing was ever filtered out. Iterate over a snapshot of
# the keys and test membership directly instead.
for name in list(xianyou_Name_url_dic1):
    if name not in xianyou_all_yanjiushengtea_name:
        del xianyou_Name_url_dic1[name]

# Visit every supervisor's personal page and collect the raw profile text
# plus the candidate e-mail lines (filtered further below).
xianyou_gerenxinxi_list = []   # per-teacher profile text fragments
xianyou_email_list = []        # per-teacher candidate e-mail lines
xianyou_img_list = []          # photos are gathered separately below
for teacher_url in xianyou_Name_url_dic1.values():
    # Fetch and parse one teacher's personal page.
    response = requests.get(url=teacher_url, headers=headers)
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)
    # Two page layouts are in use, hence the two alternative XPath branches.
    profile = tree.xpath('//div[@align="center"]/table/tbody/tr[2]/td/p/span/text() | //'
                         'div[@class="v_news_content"]/table/tbody/tr[2]/td/p/span/text()')
    xianyou_gerenxinxi_list.append(profile)
    xianyou_email_list.append(tree.xpath('//div[@align="center"]//p//text()'))
# Extract exactly one e-mail entry per teacher so this list stays aligned
# positionally with the name/profile lists ('待查' is the "to be checked"
# placeholder the script already used for teachers with no page text).
# BUG FIX: the original could append zero entries (page text present but no
# line containing "@"/"bupt") or several entries (multiple matching lines)
# for a single teacher, which silently broke that alignment.
xianyou_email_list_real = []
for email_lines in xianyou_email_list:
    for line in email_lines:
        if "@" in line or "bupt" in line:
            xianyou_email_list_real.append(line)
            break
    else:
        # No page text at all, or nothing resembling an address.
        xianyou_email_list_real.append('待查')


# Scrape the teacher photo URLs from the six staff-listing pages.
# De-duplicated: the original repeated the identical request/parse stanza
# six times (img1..img6) before concatenating; one loop produces the same
# combined list in the same page order.
img = []
for page_url in (
    'https://smp.bupt.edu.cn/szdw.htm',
    'https://smp.bupt.edu.cn/szdw/5.htm',
    'https://smp.bupt.edu.cn/szdw/4.htm',
    'https://smp.bupt.edu.cn/szdw/3.htm',
    'https://smp.bupt.edu.cn/szdw/2.htm',
    'https://smp.bupt.edu.cn/szdw/1.htm',
):
    response = requests.get(url=page_url, headers=headers)
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)
    img.extend(tree.xpath('//div[@class="main_conRCa"]/ul/li/div/a/img/@src'))

# Turn the relative src attributes into absolute URLs.
img = ['https://smp.bupt.edu.cn/' + src for src in img]

# Aggregate all scraped names, then flatten the nested per-teacher profile
# lists into one flat list.
xianyou_Name_url_list = ss1 + ss2 + ss3 + ss4 + ss5 + ss6
xianyou_gerenxinxi_list_real = [
    item
    for profile in xianyou_gerenxinxi_list
    for item in profile
]
# Output order: name, personal profile, e-mail, photo.
# NOTE(review): xianyou_Name_url_list and xianyou_gerenxinxi_list_real are
# built but never used; as in the original, only the nested profile list is
# printed — confirm whether the flat/aligned data was meant to be emitted.
print(xianyou_gerenxinxi_list)
